library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(Rtsne)
library(knitr)
library(ROCR)
library(expss)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load and prepare data

Load dataset (preprocessing code in 19_10_11_create_dataset.Rmd)

clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'), row.names=1)

Gene filtering:

  • Remove genes without cluster (Module=gray)

  • Remove genes with Module-Trait correlation missing

rm_cluster = dataset[is.na(dataset$MTcor),clustering_selected] %>% unique %>% as.character

print(paste0('Removing ', sum(dataset[,clustering_selected]=='gray'), ' genes without cluster'))
## [1] "Removing 178 genes without cluster"
print(paste0('Removing ', sum(is.na(dataset$MTcor)), ' genes belonging to module(s) without module-trait correlation: ',
             rm_cluster))
## [1] "Removing 1555 genes belonging to module(s) without module-trait correlation: #00BD64"
new_dataset = dataset %>% filter(dataset[,clustering_selected]!='gray' & !is.na(MTcor))

Variable changes:

  • Using Module Membership variables instead of binary module membership

  • Not including p-value variables

  • Including a new variable with the absolute value of GS

  • Removing information from gray module (unclassified genes) and any other module that did not have a Module-Trait value

  • Objective variable: Binary label indicating if it’s in the SFARI dataset or not (including score 6)

new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval', clustering_selected,
                                               gsub('#','',rm_cluster), sep='|')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
              dplyr::select(-gene.score)

rownames(new_dataset) = rownames(dataset)[!is.na(dataset$MTcor) & dataset[,clustering_selected]!='gray']

rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables.'))
## [1] "The final dataset contains 14867 observations and 38 variables."
rm(new_dataset)

Sample of the dataset

dataset %>% head %>% kable
MTcor GS MM.00C1A8 MM.4B9FFF MM.8195FF MM.C17FFF MM.C49A00 MM.D29300 MM.FF64B2 MM.FF699D MM.FB61D7 MM.00C0BB MM.53B400 MM.E88523 MM.00B0F6 MM.F365E6 MM.FD6F86 MM.00BBDD MM.00B70C MM.75AF00 MM.DE8C00 MM.B4A000 MM.F8766D MM.00B6EB MM.00BF7D MM.D774FD MM.A3A500 MM.F17E4F MM.00C094 MM.00A8FF MM.00BECD MM.A58AFF MM.E76BF3 MM.FF61C5 MM.00BB45 MM.8EAB00 absGS SFARI
ENSG00000000003 0.7601073 0.3827131 -0.3477130 -0.1449705 -0.1646246 -0.2829441 -0.3416614 -0.1752186 -0.1947862 0.0280602 -0.1139808 -0.1800823 -0.2265495 0.0824399 0.1910086 -0.0125141 -0.0148751 -0.1126751 -0.0522299 0.1702235 -0.0318971 0.5014482 0.3716079 0.1920288 0.3141084 0.3318810 0.4772117 0.4106499 0.1092023 0.1433862 0.1295224 -0.0098212 0.3358176 0.1393598 0.1434403 0.2296628 0.3827131 FALSE
ENSG00000000419 0.4150643 0.0361671 -0.0509414 -0.4181807 -0.3435142 -0.0950710 0.1271936 0.3731103 -0.1661217 -0.0683150 0.1835545 -0.2093919 -0.0438694 0.0019727 0.2711652 0.1487585 0.1426498 -0.1856460 -0.1161695 0.0769049 -0.0790248 -0.2102113 -0.1908434 -0.0142087 -0.0346279 -0.2307758 0.0273226 0.1300158 -0.0003214 -0.1399217 0.1449922 0.3286803 0.1418381 0.1323354 0.5218180 0.2613671 0.0361671 FALSE
ENSG00000000457 -0.9216750 -0.4005675 0.1874271 0.2806782 0.1875352 0.3747611 0.3393944 0.2716503 0.3227764 0.1136580 0.1760050 0.1146922 0.3195916 -0.2275923 -0.0641184 -0.0820587 0.0331912 0.1632690 -0.0397420 -0.1388166 -0.0426981 -0.0552223 -0.0629920 -0.1494477 -0.2855344 -0.2286590 -0.3060407 -0.2607013 -0.3293047 -0.1707641 -0.2804889 -0.0996501 -0.2359453 -0.1367952 -0.0969217 -0.2679999 0.4005675 FALSE
ENSG00000000460 -0.1165650 -0.2386668 0.0081068 0.0453039 -0.1677691 0.1659316 0.3632797 0.4758476 0.1413266 0.0073811 0.0232871 -0.0620729 0.0492475 0.1279434 0.2559762 0.1658171 0.2708015 0.0877830 -0.0212876 -0.1033743 -0.0918440 -0.2760330 -0.2148459 -0.1636790 -0.3609935 -0.4116075 -0.4156419 -0.2123721 -0.2596629 -0.3395212 -0.0682352 0.1426534 0.0297760 0.1965458 0.3086452 -0.2279076 0.2386668 FALSE
ENSG00000000971 0.8556942 0.6586707 -0.2759526 -0.3511587 -0.2780262 -0.3940097 -0.4692445 -0.2210057 -0.1077318 0.0231030 -0.0266761 -0.2443301 -0.2726795 0.0025340 0.0112294 0.2300797 -0.1711726 -0.3056525 0.0899908 -0.0559141 0.0961331 0.0052618 -0.1778786 0.2297739 0.1605946 0.2297970 0.4252786 0.2986677 0.8196572 0.4208318 0.5669501 0.0737886 0.1440107 0.2222202 0.2006217 0.0904164 0.6586707 FALSE
ENSG00000001036 0.7601073 0.3221038 -0.1361035 -0.5330410 -0.4449867 -0.5478868 -0.3157576 0.0951756 -0.3301118 -0.2001291 0.1974429 -0.1947359 -0.1959688 0.1137417 0.0583008 0.0552634 0.0148300 -0.4207842 -0.1377539 0.0398521 0.1730645 -0.0030739 0.2322611 0.0643902 -0.0654919 -0.2230414 0.1810246 0.5690748 0.0845592 0.0307338 0.3770384 0.5580772 0.5086895 0.4994903 0.5013863 0.4209643 0.3221038 FALSE

Objective variable distribution: Unbalanced labels

print(table(dataset$SFARI))
## 
## FALSE  TRUE 
## 14039   828
cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
## 
## 5.57% of the observations are positive

Visualisations

Visualising the variables

Chose the t-SNE algorithm because it preserves distances

The SFARI labels is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 2 clusters

tsne = dataset %>% t %>% Rtsne(perplexity=10)

plot_data = data.frame('ID'=colnames(dataset), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=type)) + geom_point(aes(id=ID)) + 
         theme_minimal() + ggtitle('t-SNE visualisation of variables'))

The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag

mtcor_by_module = original_dataset %>% dplyr::select(matches(clustering_selected), MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=MTcor)) + geom_point(aes(id=ID)) + 
         scale_color_viridis() + theme_minimal() + 
         ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)

Visualising the observations

# Mean Expression data
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.7) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(aes(alpha=alpha)) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.5) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)

Resampling to reduce class imbalance

For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method

Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set

positive_idx = which(dataset$SFARI)
negative_idx = which(!dataset$SFARI)

set.seed(123)
positive_test_idx = sort(sample(positive_idx, size=floor(0.2*length(positive_idx))))
positive_train_idx = positive_idx[!positive_idx %in% positive_test_idx]

set.seed(123)
negative_test_idx = sort(sample(negative_idx, size=floor(0.2*length(positive_idx))))
negative_train_idx = negative_idx[!negative_idx %in% negative_test_idx]

train_set = dataset[c(positive_train_idx,negative_train_idx),]
test_set = dataset[c(positive_test_idx,negative_test_idx),]

rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx)

Balancing the dataset to obtain a 1:1 ratio in labels

Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations

Sample with replacement positive observations in train set

positive_obs = which(train_set$SFARI)

set.seed(123)
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)

train_set = train_set[c(1:nrow(train_set), add_obs),]

rm(positive_obs, add_obs)

Under-sampling observations with negative SFARI labels

print(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
             '% of the Negative observations in the training set'))
## [1] "Keeping ~19% of the Negative observations in the training set"
negative_obs = which(!train_set$SFARI)
set.seed(123)
remove_obs = sample(negative_obs, size=(sum(!train_set$SFARI)-sum(train_set$SFARI)))

train_set = train_set[-remove_obs,]


rm(negative_obs, remove_obs)

Label distribution in training set

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  2652
   TRUE  2652
   #Total cases  5304

Labels distribution in test set

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  165
   TRUE  165
   #Total cases  330

Logistic Regression

Train model

train_set$SFARI = train_set$SFARI %>% as.factor

fit = glm(SFARI~., data=train_set, family='binomial')

Predict labels in test set

test_set$prob = predict(fit, newdata=test_set, type='response')
test_set$pred = test_set$prob>0.5

Performance metrics

Confusion matrix

conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  101 64   165
   TRUE  64 101   165
   #Total cases  165 165   330
rm(conf_mat)

Accuracy

acc = mean(test_set$SFARI==test_set$pred)

print(paste0('Accuracy = ', round(acc,4)))
## [1] "Accuracy = 0.6121"
rm(acc)

ROC Curve

pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')

Lift Curve

lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)

Analyse model

SFARI genes have a slightly higher distribution than the rest

plot_data = test_set %>% dplyr::select(prob, SFARI)

plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + 
              theme_minimal() + ggtitle('Probability distribution by SFARI Label')

  • Scores 1 and 2 have the highest scores!

  • Score 6 has a completely different behaviour to the rest, with a lower distribution even than the unlabelled genes

plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  8
   2  11
   3  34
   4  76
   5  32
   6  4
   None  165
   #Total cases  330
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

plot_data %>% ggplot(aes(prob, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) + 
              geom_vline(data=mean_vals, aes(xintercept=mean_prob, color=gene.score), linetype='dashed') +
              scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('Probability') + ylab('Density') + theme_minimal()

ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal())
rm(mean_vals)
summary(fit)
## 
## Call:
## glm(formula = SFARI ~ ., family = "binomial", data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2492  -1.0532   0.0199   1.0436   2.1933  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.37059    0.05727  -6.471 9.75e-11 ***
## MTcor         0.12633    0.08256   1.530 0.125962    
## GS           -2.76945    1.01602  -2.726 0.006415 ** 
## MM.00C1A8    -0.16808    0.80843  -0.208 0.835303    
## MM.4B9FFF    -2.24179    2.24440  -0.999 0.317873    
## MM.8195FF     0.46242    2.39666   0.193 0.847003    
## MM.C17FFF     8.90924    2.94143   3.029 0.002455 ** 
## MM.C49A00   -11.62208    3.30815  -3.513 0.000443 ***
## MM.D29300    -0.13735    2.28756  -0.060 0.952122    
## MM.FF64B2    -5.58744    1.56680  -3.566 0.000362 ***
## MM.FF699D    -1.71365    0.91484  -1.873 0.061044 .  
## MM.FB61D7    11.05199    2.14017   5.164 2.42e-07 ***
## MM.00C0BB    -4.94969    2.99814  -1.651 0.098755 .  
## MM.53B400     3.76623    3.45902   1.089 0.276236    
## MM.E88523     1.84402    0.68199   2.704 0.006853 ** 
## MM.00B0F6     0.27711    1.21039   0.229 0.818915    
## MM.F365E6    -0.61250    3.31705  -0.185 0.853503    
## MM.FD6F86     8.80736    4.23963   2.077 0.037766 *  
## MM.00BBDD     6.18613    2.22925   2.775 0.005520 ** 
## MM.00B70C    -5.21552    2.99197  -1.743 0.081303 .  
## MM.75AF00    -0.67852    1.12138  -0.605 0.545129    
## MM.DE8C00     0.95878    0.95129   1.008 0.313516    
## MM.B4A000     1.96372    1.67126   1.175 0.239996    
## MM.F8766D    -1.76638    1.46890  -1.203 0.229161    
## MM.00B6EB     3.21914    1.12566   2.860 0.004239 ** 
## MM.00BF7D     3.25031    1.67267   1.943 0.051993 .  
## MM.D774FD     0.94612    2.02591   0.467 0.640491    
## MM.A3A500    -6.20675    2.00338  -3.098 0.001947 ** 
## MM.F17E4F     3.97799    3.19984   1.243 0.213800    
## MM.00C094     5.37684    1.56804   3.429 0.000606 ***
## MM.00A8FF    -3.29108    2.90941  -1.131 0.257977    
## MM.00BECD     1.44690    1.91594   0.755 0.450133    
## MM.A58AFF     1.56584    1.62380   0.964 0.334892    
## MM.E76BF3     2.68760    2.92319   0.919 0.357883    
## MM.FF61C5    -0.94481    1.46724  -0.644 0.519618    
## MM.00BB45    -7.62739    2.58661  -2.949 0.003190 ** 
## MM.8EAB00    -5.44687    1.88788  -2.885 0.003912 ** 
## absGS         0.28926    0.15747   1.837 0.066219 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7352.9  on 5303  degrees of freedom
## Residual deviance: 6614.0  on 5266  degrees of freedom
## AIC: 6690
## 
## Number of Fisher Scoring iterations: 4

There doesn’t seem to be a relation between the modules’ coefficient and their correlation with Diagnosis, although there is correlation between the variables, so it’s not correct to analyse their coefficients when this happens

Also, this results changes a lot between different runs

var_fit_info = summary(fit)$coefficients %>% as.data.frame %>% 
               mutate(signif=`Pr(>|z|)`<0.05, ID=rownames(summary(fit)$coefficients))

plot_data = original_dataset %>% dplyr::select(matches(clustering_selected), MTcor) 
colnames(plot_data) = c('ID','MTcor')

plot_data = plot_data %>% mutate(ID=gsub('#','MM.',ID)) %>% group_by(ID, MTcor) %>% 
            tally %>% inner_join(var_fit_info, by='ID')

ggplotly(plot_data %>% ggplot(aes(Estimate, MTcor, color=signif, size=n)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='lm', se=FALSE) + theme_minimal() +
         xlab('Coefficient in regression') + ylab('Module-Diagnosis correlation'))
rm(var_fit_info)

Strong correlations between variables

cors = cor(train_set[,-ncol(train_set)])
heatmap.2(cors, dendrogram='none', col=brewer.pal(11,'RdBu'), scale='none', trace='none')

rm(cors)

Print genes with highest scores in test set

  • First gene has score=1

  • Not a single gene with score 6

test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>% 
             dplyr::select(ID, prob, SFARI, gene.score, MTcor, matches(clustering_selected)) %>%
             kable
ID prob SFARI gene.score MTcor DynamicHybridMergedSmall
ENSG00000118058 0.8772247 TRUE 1 0.2020738 #00B70C
ENSG00000152217 0.8743730 TRUE 3 0.2020738 #00B70C
ENSG00000139767 0.8674337 TRUE 5 -0.5362834 #00C0BB
ENSG00000162105 0.8611231 TRUE 2 -0.5362834 #00C0BB
ENSG00000183117 0.8428797 TRUE 4 -0.9216750 #C17FFF
ENSG00000127124 0.8414151 TRUE 3 0.2020738 #00B70C
ENSG00000170579 0.8389252 TRUE 3 -0.9216750 #C17FFF
ENSG00000197694 0.8388399 FALSE None -0.5362834 #00C0BB
ENSG00000049618 0.8365423 TRUE 1 0.2020738 #00B70C
ENSG00000169057 0.8329187 TRUE 2 -0.6714074 #8195FF
ENSG00000116539 0.8326399 TRUE 1 0.2020738 #00B70C
ENSG00000147010 0.8268095 TRUE 5 0.2020738 #00B70C
ENSG00000125851 0.8227207 FALSE None -0.6557529 #53B400
ENSG00000119866 0.8194519 TRUE 2 -0.5362834 #00C0BB
ENSG00000105409 0.7985797 TRUE 4 -0.5362834 #00C0BB
ENSG00000130477 0.7928487 TRUE 4 -0.9216750 #C17FFF
ENSG00000082701 0.7927872 TRUE 5 -0.5362834 #00C0BB
ENSG00000133958 0.7836568 TRUE 3 0.2020738 #00B70C
ENSG00000133216 0.7812700 TRUE 4 0.4840783 #00B6EB
ENSG00000078018 0.7795482 TRUE 5 -0.5986466 #4B9FFF
ENSG00000074054 0.7725906 TRUE 3 -0.4966667 #00C1A8
ENSG00000197892 0.7718133 TRUE 4 0.2020738 #00B70C
ENSG00000128815 0.7687111 TRUE 3 0.6236224 #00BF7D
ENSG00000172292 0.7646495 FALSE None -0.9216750 #C17FFF
ENSG00000136531 0.7632218 TRUE 1 -0.9216750 #C17FFF
ENSG00000071242 0.7628319 TRUE 4 -0.5362834 #00C0BB
ENSG00000126705 0.7614435 TRUE 3 -0.5362834 #00C0BB
ENSG00000163635 0.7603906 TRUE 5 0.4727632 #F365E6
ENSG00000198752 0.7545511 TRUE 3 -0.5362834 #00C0BB
ENSG00000185345 0.7532615 TRUE 3 -0.9216750 #C17FFF
ENSG00000148737 0.7520793 TRUE 3 -0.5362834 #00C0BB
ENSG00000164061 0.7519603 FALSE None -0.5362834 #00C0BB
ENSG00000121297 0.7482135 TRUE 4 0.2020738 #00B70C
ENSG00000176697 0.7390388 TRUE 5 0.4840783 #00B6EB
ENSG00000100354 0.7378861 TRUE 2 0.4843988 #8EAB00
ENSG00000112640 0.7351863 TRUE 4 -0.6557529 #53B400
ENSG00000139910 0.7307468 FALSE None -0.9216750 #C17FFF
ENSG00000107105 0.7228099 TRUE 4 -0.9216750 #C17FFF
ENSG00000157014 0.7213163 FALSE None -0.5362834 #00C0BB
ENSG00000178568 0.7208969 TRUE 5 -0.5986466 #4B9FFF
ENSG00000170043 0.7178050 FALSE None -0.5362834 #00C0BB
ENSG00000132821 0.7165198 FALSE None -0.6714074 #8195FF
ENSG00000187555 0.7159583 TRUE 2 -0.6557529 #53B400
ENSG00000170485 0.7103425 TRUE 4 -0.5362834 #00C0BB
ENSG00000116254 0.7067786 TRUE 5 -0.5986466 #4B9FFF
ENSG00000137075 0.7033723 TRUE 4 -0.5362834 #00C0BB
ENSG00000132535 0.7028420 TRUE 5 -0.6714074 #8195FF
ENSG00000135365 0.7020656 TRUE 4 0.2020738 #00B70C
ENSG00000166206 0.7005120 TRUE 2 -0.1535426 #FF699D
ENSG00000095787 0.7004685 TRUE 2 -0.1165650 #FD6F86

Negative samples distribution

Running the model on all non-SFARI genes

negative_set = dataset %>% filter(!SFARI) %>% dplyr::select(-SFARI)
rownames(negative_set) = rownames(dataset)[!dataset$SFARI]

negative_set$prob = predict(fit, newdata=negative_set, type='response')
negative_set$pred = negative_set$prob>0.5

negative_set = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                             pred = 'Label Prediction')
cro(negative_set$pred)
 #Total 
 Label Prediction 
   FALSE  9064
   TRUE  4975
   #Total cases  14039
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
## 
## 4975 genes are predicted as ASD-related
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#666666', linetype='dashed') + 
                 ggtitle('Probability distribution of all the Negative samples in the dataset') + 
                 theme_minimal()

Probability and Gene Significance

There’s a lot of noise, but the genes with the highest probabilities have slightly higher (absolute) Gene Significance

negative_set %>% ggplot(aes(prob, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
                 geom_hline(yintercept=0, color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + theme_minimal()

negative_set %>% ggplot(aes(prob, abs(GS), color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Model probability and Gene Significance') + theme_minimal()

Probability and Module-Diagnosis correlation

On average, the genes belonging to the modules with the highest correlation to ASD are assigned a lower probability by the model (???)

negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + geom_smooth(method='loess', color='#666666') + 
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
                 theme_minimal()

Summarised version, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
            dplyr::select(matches(clustering_selected), MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())

Probability and level of expression

There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

# Gandal dataset
load('./../Data/Gandal/preprocessed_data_non_standardised_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, matches(clustering_selected)), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('Mean expression vs model probability by gene')

rm(mean_and_sd)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())

ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + geom_point(color=plot_data2$Module) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)

Probability and level of expression

There is also a positive relation between the standard deviation of a gene and its regression score, the model could be capturing this characteristic of the genes to make the prediction, and could be introducing bias

plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.2) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('SD vs model probability by gene')

This approximation curve looks like the opposite of the trend found between mean/sd and model scores

plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              scale_x_log10() + scale_y_log10() +
              theme_minimal() + ggtitle('Mean expression vs SD by gene')

Probability and lfc

There is a relation between probability and lfc, so it IS capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% 
            left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

plot_data %>% filter(abs(log2FoldChange)<10) %>%
              ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              theme_minimal() + ggtitle('lfc vs model probability by gene')



Conclusion

Original conclusion: The model is capturing the mean level of expression of the genes (indirectly through module memberhsip), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots) …

New conclusion: The model seems to be capturing some sort of confounding variable to make the predictions, it would seem that it’s related to the mean expression or the standard deviation of the genes, but in 10_10_20_data_preprocessing_standardising_expr.RData we standardised the dataset and it made no difference in the resulting clusterings, which means that there is some other behaviour related to the level of expression of a gene or its standard deviation that is biasing the classifier, but I don’t know what it could be or how to fix it …


Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] expss_0.9.1        ROCR_1.0-7         gplots_3.0.1      
##  [4] Rtsne_0.15         RColorBrewer_1.1-2 gridExtra_2.3     
##  [7] viridis_0.5.1      viridisLite_0.3.0  plotly_4.8.0      
## [10] knitr_1.22         forcats_0.3.0      stringr_1.4.0     
## [13] dplyr_0.8.0.1      purrr_0.3.1        readr_1.3.1       
## [16] tidyr_0.8.3        tibble_2.1.1       ggplot2_3.1.0     
## [19] tidyverse_1.2.1   
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1            htmlTable_1.11.2           
##   [3] XVector_0.22.0              GenomicRanges_1.34.0       
##   [5] base64enc_0.1-3             rstudioapi_0.7             
##   [7] bit64_0.9-7                 AnnotationDbi_1.44.0       
##   [9] lubridate_1.7.4             xml2_1.2.0                 
##  [11] splines_3.5.2               geneplotter_1.60.0         
##  [13] Formula_1.2-3               jsonlite_1.6               
##  [15] Cairo_1.5-9                 annotate_1.60.1            
##  [17] broom_0.5.1                 cluster_2.0.7-1            
##  [19] shiny_1.2.0                 compiler_3.5.2             
##  [21] httr_1.4.0                  backports_1.1.2            
##  [23] assertthat_0.2.1            Matrix_1.2-15              
##  [25] lazyeval_0.2.2              cli_1.1.0                  
##  [27] later_0.8.0                 acepack_1.4.1              
##  [29] htmltools_0.3.6             tools_3.5.2                
##  [31] gtable_0.2.0                glue_1.3.1                 
##  [33] GenomeInfoDbData_1.2.0      Rcpp_1.0.1                 
##  [35] Biobase_2.42.0              cellranger_1.1.0           
##  [37] gdata_2.18.0                nlme_3.1-137               
##  [39] crosstalk_1.0.0             xfun_0.5                   
##  [41] rvest_0.3.2                 miniUI_0.1.1.1             
##  [43] mime_0.6                    gtools_3.5.0               
##  [45] XML_3.98-1.11               zlibbioc_1.28.0            
##  [47] scales_1.0.0                hms_0.4.2                  
##  [49] promises_1.0.1              parallel_3.5.2             
##  [51] SummarizedExperiment_1.12.0 yaml_2.2.0                 
##  [53] memoise_1.1.0               rpart_4.1-13               
##  [55] ggExtra_0.8                 RSQLite_2.1.1              
##  [57] latticeExtra_0.6-28         stringi_1.4.3              
##  [59] genefilter_1.64.0           highr_0.8                  
##  [61] S4Vectors_0.20.1            checkmate_1.8.5            
##  [63] caTools_1.17.1              BiocGenerics_0.28.0        
##  [65] BiocParallel_1.16.6         GenomeInfoDb_1.18.2        
##  [67] rlang_0.3.2                 pkgconfig_2.0.2            
##  [69] matrixStats_0.54.0          bitops_1.0-6               
##  [71] evaluate_0.13               lattice_0.20-38            
##  [73] htmlwidgets_1.3             labeling_0.3               
##  [75] bit_1.1-14                  tidyselect_0.2.5           
##  [77] plyr_1.8.4                  magrittr_1.5               
##  [79] DESeq2_1.22.2               R6_2.4.0                   
##  [81] IRanges_2.16.0              generics_0.0.2             
##  [83] Hmisc_4.1-1                 DBI_1.0.0                  
##  [85] DelayedArray_0.8.0          pillar_1.3.1               
##  [87] haven_1.1.1                 foreign_0.8-71             
##  [89] withr_2.1.2                 survival_2.43-3            
##  [91] RCurl_1.95-4.10             nnet_7.3-12                
##  [93] modelr_0.1.4                crayon_1.3.4               
##  [95] KernSmooth_2.23-15          rmarkdown_1.12             
##  [97] locfit_1.5-9.1              grid_3.5.2                 
##  [99] readxl_1.1.0                data.table_1.12.0          
## [101] blob_1.1.1                  digest_0.6.18              
## [103] xtable_1.8-3                httpuv_1.5.0               
## [105] stats4_3.5.2                munsell_0.5.0